- to set indents/bulltet points<u> around a word to underlinebackground-color and color to highlight text with different colours text.<span>
{width=100% height=100%}code_folding: hideeval and includedata(mtcars) # this is an inbuilt dataset in R - you'll be working with a few more of these in Assignment 3
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Load the ggplot2 package
library(ggplot2)
# Create the scatter plot
ggplot(mtcars, aes(x = hp, y = cyl)) +
geom_point() +
labs(title = "Scatterplot of Horsepower (hp) vs. Cylinders (cyl)",
x = "Horsepower (hp)",
y = "Number of Cylinders (cyl)") +
theme_minimal()
# when eval=FALSE, this code chunk will not be evaulated/processed by R
data(mtcars) # this is an inbuilt dataset in R - you'll be working with a few more of these in Assignment 3
head(mtcars)
# a vector of numbers
v <- c(1, 2, 3, 4) # Numeric vector
v
## [1] 1 2 3 4
which(v == 2) # can be queried
## [1] 2
v = v*10 # and modified
v_char <- c("A", "B", "C") # can be numeric or character, but not both
v_char
## [1] "A" "B" "C"
v_char[3] # access the third element in this vector
## [1] "C"
# a list
lst <- list(10, "hello", TRUE, c(1, 2, 3))
lst # can contain different types of elements, numbers, characters, logical (TRUE/FALSE)
## [[1]]
## [1] 10
##
## [[2]]
## [1] "hello"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1 2 3
lst[[4]] # access the fourth item in this list
## [1] 1 2 3
lst[[4]][2] # access the second element of the fourth item in this list
## [1] 2
source: ChatGPT
# this is a matrix
matrix(sample(1:1000, 100), 10,10)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 34 764 602 71 468 984 176 418 332 817
## [2,] 170 696 716 382 265 702 459 992 597 206
## [3,] 41 913 327 514 341 762 271 264 744 471
## [4,] 937 810 753 458 29 425 620 404 33 430
## [5,] 105 38 688 369 165 395 986 563 450 234
## [6,] 451 900 75 427 982 440 860 581 226 148
## [7,] 36 320 847 951 601 295 924 238 878 313
## [8,] 89 855 821 172 273 695 377 103 80 263
## [9,] 889 543 685 296 44 962 717 861 396 478
## [10,] 679 481 115 392 719 966 862 335 991 490
# this is a dataframe
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Download files for this workshop
download.file(
"https://monashdatafluency.github.io/r-intro-2/r-intro.zip",
destfile="r-intro.zip")
unzip("r-intro.zip")
# Install Tidyverse
install.packages("tidyverse")
<chr> - character strings<dbl> - numerical values. Technically these are
“doubles”, which is a way of storing numbers with 15 digits
precision.<lgl> - logical values, TRUE or FALSE.<int> - integers, a fancy name for whole
numbers.<fct> - factors, categorical data. We will get to
this shortly.geo
## # A tibble: 196 × 7
## name region oecd g77 lat long income2017
## <chr> <chr> <lgl> <lgl> <dbl> <dbl> <chr>
## 1 Australia asia TRUE FALSE -25 135 high
## 2 Brunei asia FALSE TRUE 4.5 115. high
## 3 Cambodia asia FALSE TRUE 13 105 lower_mid
## 4 China asia FALSE TRUE 35 105 upper_mid
## 5 Fiji asia FALSE TRUE -18 178 upper_mid
## 6 Hong Kong, China asia FALSE FALSE 22.3 114. high
## 7 Indonesia asia FALSE TRUE -5 120 lower_mid
## 8 Japan asia TRUE FALSE 35.7 140. high
## 9 Kiribati asia FALSE FALSE 1.42 173. lower_mid
## 10 North Korea asia FALSE TRUE 40 127 low
## # ℹ 186 more rows
head(geo2)
## name region oecd g77 lat long income2017
## 1 Australia asia TRUE FALSE -25.00000 135.0000 high
## 2 Brunei asia FALSE TRUE 4.50000 114.6667 high
## 3 Cambodia asia FALSE TRUE 13.00000 105.0000 lower_mid
## 4 China asia FALSE TRUE 35.00000 105.0000 upper_mid
## 5 Fiji asia FALSE TRUE -18.00000 178.0000 upper_mid
## 6 Hong Kong, China asia FALSE FALSE 22.28552 114.1577 high
summary(geo)
## name region oecd g77
## Length:196 Length:196 Mode :logical Mode :logical
## Class :character Class :character FALSE:165 FALSE:65
## Mode :character Mode :character TRUE :31 TRUE :131
##
##
##
## lat long income2017
## Min. :-42.00 Min. :-175.000 Length:196
## 1st Qu.: 4.00 1st Qu.: -5.625 Class :character
## Median : 17.42 Median : 21.875 Mode :character
## Mean : 19.03 Mean : 23.004
## 3rd Qu.: 39.82 3rd Qu.: 51.892
## Max. : 65.00 Max. : 179.145
table(geo$region)
##
## africa americas asia europe
## 54 35 59 48
geo_asia = geo[c(which(geo$region == "asia")),] # this searches for all asia labels in the region column
head(geo_asia); dim(geo_asia)
## # A tibble: 6 × 7
## name region oecd g77 lat long income2017
## <chr> <chr> <lgl> <lgl> <dbl> <dbl> <chr>
## 1 Australia asia TRUE FALSE -25 135 high
## 2 Brunei asia FALSE TRUE 4.5 115. high
## 3 Cambodia asia FALSE TRUE 13 105 lower_mid
## 4 China asia FALSE TRUE 35 105 upper_mid
## 5 Fiji asia FALSE TRUE -18 178 upper_mid
## 6 Hong Kong, China asia FALSE FALSE 22.3 114. high
## [1] 59 7
geo2[1:2,1:2] # pull out rows 1:2 and columns 1:2
## name region
## 1 Australia asia
## 2 Brunei asia
filter(geo, lat < 0 & oecd)
## # A tibble: 3 × 7
## name region oecd g77 lat long income2017
## <chr> <chr> <lgl> <lgl> <dbl> <dbl> <chr>
## 1 Australia asia TRUE FALSE -25 135 high
## 2 New Zealand asia TRUE FALSE -42 174 high
## 3 Chile americas TRUE TRUE -33.5 -70.6 high
geo_filter = filter(geo, lat < 0 & oecd)
dim(geo_filter)
## [1] 3 7
arrange(geo, lat)
## # A tibble: 196 × 7
## name region oecd g77 lat long income2017
## <chr> <chr> <lgl> <lgl> <dbl> <dbl> <chr>
## 1 New Zealand asia TRUE FALSE -42 174 high
## 2 Argentina americas FALSE TRUE -34 -64 upper_mid
## 3 Chile americas TRUE TRUE -33.5 -70.6 high
## 4 Uruguay americas FALSE TRUE -33 -56 high
## 5 Lesotho africa FALSE TRUE -29.5 28.2 lower_mid
## 6 South Africa africa FALSE TRUE -29 24 upper_mid
## 7 Swaziland africa FALSE TRUE -26.5 31.5 lower_mid
## 8 Australia asia TRUE FALSE -25 135 high
## 9 Paraguay americas FALSE TRUE -23.3 -58 upper_mid
## 10 Botswana africa FALSE TRUE -22 24 upper_mid
## # ℹ 186 more rows
geo[order(geo$lat),]
## # A tibble: 196 × 7
## name region oecd g77 lat long income2017
## <chr> <chr> <lgl> <lgl> <dbl> <dbl> <chr>
## 1 New Zealand asia TRUE FALSE -42 174 high
## 2 Argentina americas FALSE TRUE -34 -64 upper_mid
## 3 Chile americas TRUE TRUE -33.5 -70.6 high
## 4 Uruguay americas FALSE TRUE -33 -56 high
## 5 Lesotho africa FALSE TRUE -29.5 28.2 lower_mid
## 6 South Africa africa FALSE TRUE -29 24 upper_mid
## 7 Swaziland africa FALSE TRUE -26.5 31.5 lower_mid
## 8 Australia asia TRUE FALSE -25 135 high
## 9 Paraguay americas FALSE TRUE -23.3 -58 upper_mid
## 10 Botswana africa FALSE TRUE -22 24 upper_mid
## # ℹ 186 more rows
gap <- read_csv("r-intro/gap-minder.csv")
## Rows: 4312 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name
## dbl (4): year, population, gdp_percap, life_exp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gap
## # A tibble: 4,312 × 5
## name year population gdp_percap life_exp
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 1800 3280000 603 28.2
## 2 Albania 1800 410445 667 35.4
## 3 Algeria 1800 2503218 715 28.8
## 4 Andorra 1800 2654 1197 NA
## 5 Angola 1800 1567028 618 27.0
## 6 Antigua and Barbuda 1800 37000 757 33.5
## 7 Argentina 1800 534000 1507 33.2
## 8 Armenia 1800 413326 514 34
## 9 Australia 1800 351014 814 34.0
## 10 Austria 1800 3205587 1847 34.4
## # ℹ 4,302 more rows
gap_geo <- left_join(gap, geo, by="name")
gap_geo
## # A tibble: 4,312 × 11
## name year population gdp_percap life_exp region oecd g77 lat long
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <lgl> <dbl> <dbl>
## 1 Afghani… 1800 3280000 603 28.2 asia FALSE TRUE 33 66
## 2 Albania 1800 410445 667 35.4 europe FALSE FALSE 41 20
## 3 Algeria 1800 2503218 715 28.8 africa FALSE TRUE 28 3
## 4 Andorra 1800 2654 1197 NA europe FALSE FALSE 42.5 1.52
## 5 Angola 1800 1567028 618 27.0 africa FALSE TRUE -12.5 18.5
## 6 Antigua… 1800 37000 757 33.5 ameri… FALSE TRUE 17.0 -61.8
## 7 Argenti… 1800 534000 1507 33.2 ameri… FALSE TRUE -34 -64
## 8 Armenia 1800 413326 514 34 europe FALSE FALSE 40.2 45
## 9 Austral… 1800 351014 814 34.0 asia TRUE FALSE -25 135
## 10 Austria 1800 3205587 1847 34.4 europe TRUE FALSE 47.3 13.3
## # ℹ 4,302 more rows
## # ℹ 1 more variable: income2017 <chr>
library(tidyverse)
geo <- read_csv("r-intro/geo.csv")
gap <- read_csv("r-intro/gap-minder.csv")
gap_geo <- left_join(gap, geo, by="name")
ggplot(gap_geo, aes(x=year, y=life_exp)) +
geom_point()
plot(gap_geo$year, gap_geo$life_exp, pch=16, xlab="year", ylab="life expectancy")
plot(gap_geo$year, gap_geo$life_exp, pch=16, xlab="year", ylab="life expectancy", col="#00000050")
ggplot(gap_geo, aes(x=year, y=life_exp, color=region, size=population)) +
geom_point()
ggplot(gap_geo, aes(x=year, y=life_exp, group=name, color=region)) +
geom_line()
ggplot(gap_geo, aes(x=year, y=life_exp, group=year)) +
geom_violin()
ggplot(gap_geo, aes(x=year, y=life_exp)) +
geom_point() +
geom_smooth()
ggplot(gap_geo, aes(x=year, y=life_exp)) +
geom_line(aes(group=name)) +
geom_smooth(aes(color=oecd))
https://rviews.rstudio.com/2017/09/25/survival-analysis-with-r/
library(survival)
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)
library(ggsurvfit)
data(veteran)
head(veteran)
## trt celltype time status karno diagtime age prior
## 1 1 squamous 72 1 60 7 69 0
## 2 1 squamous 411 1 70 5 64 10
## 3 1 squamous 228 1 60 3 38 0
## 4 1 squamous 126 1 60 9 63 10
## 5 1 squamous 118 1 70 11 65 10
## 6 1 squamous 10 1 20 5 49 0
dim(veteran)
## [1] 137 8
km <- with(veteran, Surv(time, status))
head(km,80)
## [1] 72 411 228 126 118 10 82 110 314 100+ 42 8 144 25+ 11
## [16] 30 384 4 54 13 123+ 97+ 153 59 117 16 151 22 56 21
## [31] 18 139 20 31 52 287 18 51 122 27 54 7 63 392 10
## [46] 8 92 35 117 132 12 162 3 95 177 162 216 553 278 12
## [61] 260 200 156 182+ 143 105 103 250 100 999 112 87+ 231+ 242 991
## [76] 111 1 587 389 33
km_fit <- survfit(Surv(time, status) ~ 1, data=veteran)
summary(km_fit, times = c(1,30,60,90*(1:10)))
## Call: survfit(formula = Surv(time, status) ~ 1, data = veteran)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 137 2 0.985 0.0102 0.96552 1.0000
## 30 97 39 0.700 0.0392 0.62774 0.7816
## 60 73 22 0.538 0.0427 0.46070 0.6288
## 90 62 10 0.464 0.0428 0.38731 0.5560
## 180 27 30 0.222 0.0369 0.16066 0.3079
## 270 16 9 0.144 0.0319 0.09338 0.2223
## 360 10 6 0.090 0.0265 0.05061 0.1602
## 450 5 5 0.045 0.0194 0.01931 0.1049
## 540 4 1 0.036 0.0175 0.01389 0.0934
## 630 2 2 0.018 0.0126 0.00459 0.0707
## 720 2 0 0.018 0.0126 0.00459 0.0707
## 810 2 0 0.018 0.0126 0.00459 0.0707
## 900 2 0 0.018 0.0126 0.00459 0.0707
autoplot(km_fit)
km_trt_fit <- survfit(Surv(time, status) ~ trt, data=veteran)
autoplot(km_trt_fit)
# compare this to ggplot survival plot
survfit2(Surv(time, status) ~ trt, data = veteran) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
)
vet <- mutate(veteran, AG = ifelse((age < 60), "LT60", "OV60"),
AG = factor(AG),
trt = factor(trt,labels=c("standard","test")),
prior = factor(prior,labels=c("N0","Yes")))
km_AG_fit <- survfit(Surv(time, status) ~ AG, data=vet)
autoplot(km_AG_fit)
F = read.delim("~/Desktop/GDCdata/clinical.cohort.2025-03-19/follow_up.tsv", sep = "\t", header = TRUE, fill = TRUE) # read in the follow_up.tsv file
length(which(!duplicated(F$case_id))) # 5949 - so a bit less than 7625
## [1] 5949
C = read.delim("~/Desktop/GDCdata/clinical.cohort.2025-03-19/clinical.tsv", sep = "\t", header = TRUE, fill = TRUE) # read in the clinical.tsv file
length(which(!duplicated(C$case_id))) # 7625
## [1] 7625
# extract days to death data from 'days_to_death' column
Cdtd=C[,c("case_id","days_to_death")]
Cdtd=Cdtd[c(which(Cdtd$days_to_death != "'--")),]
head(Cdtd)
## case_id days_to_death
## 10 0079b2ff-6c93-47b0-95cb-4cc9b8915e12 1270
## 28 0094e07c-1595-402e-9d38-68b9cac71e7b 1753
## 29 0094e07c-1595-402e-9d38-68b9cac71e7b 1753
## 32 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
## 33 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
## 34 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
length(which(duplicated(Cdtd$case_id)))
## [1] 3866
Cdtd=Cdtd[c(which(!duplicated(Cdtd$case_id))),]
dim(Cdtd)
## [1] 1496 2
#hist(Cdtd$days_to_death)
hist(as.numeric(Cdtd$days_to_death),breaks=100) # lets look at a histogram of days until death
# now extract days to last follow-up from the days_to_last_follow_up column
# this will be used to code follow-up time for individuals who were still alive at the end of the observed follow-up
Cfu=C[,c("case_id","days_to_last_follow_up")] # just extract the variables that we need
Cfu=Cfu[c(which(Cfu$days_to_last_follow_up != "'--")),] # need to remove rows with this 'filler'
dim(Cfu)
## [1] 7866 2
head(Cfu)
## case_id days_to_last_follow_up
## 4 005c990a-3b38-521a-b849-0bd12b23043e 917.0
## 7 006a93d5-096f-5c34-9948-c934157fdc58 3853.0
## 9 0074779b-5919-54ff-89f5-a26f42d7d0d9 3259.0
## 12 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832.0
## 13 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832.0
## 14 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832.0
Cfu=Cfu[c(which(!duplicated(Cfu$days_to_last_follow_up))),] # this file includes alot of duplicates to help 'pad' for other variables that may have more than one level recorded for one patient, i.e. they will have multiple rows which cause duplicates to appear for this variable that need to be removed.
Ffu=F[,c("case_id","days_to_follow_up")] # just extract variables we need
Ffu=Ffu[c(which(Ffu$days_to_follow_up != "'--")),] # need to remove rows with this 'filler'
Ffu=Ffu[c(which(!duplicated(Ffu$case_id))),] # need to remove duplicated rows
# now we can merge the death and follow-up variables together, using the case Id as the merge variable
M=merge(Ffu,Cfu,by="case_id",all=TRUE)
M=merge(M,Cdtd,by="case_id",all=TRUE)
# you'll see that we actually merged 3 different extracted variables, days_to_death, days_to_last_follow_up and days_to_follow_up
M[1:30,]
## case_id days_to_follow_up
## 1 00318ddd-89bc-4a35-af79-5ab407fbd278 2976
## 2 005c990a-3b38-521a-b849-0bd12b23043e <NA>
## 3 00614f9b-1f5f-5037-bd29-bb0bb2c82426 3284
## 4 006a93d5-096f-5c34-9948-c934157fdc58 <NA>
## 5 0074779b-5919-54ff-89f5-a26f42d7d0d9 <NA>
## 6 0079b2ff-6c93-47b0-95cb-4cc9b8915e12 1270
## 7 007ddc83-31c6-59f1-a093-a063b4806f96 2649
## 8 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832
## 9 008ddf20-f7fb-4673-b2ed-b5d5212fbf09 59
## 10 0094e07c-1595-402e-9d38-68b9cac71e7b 494
## 11 00a65c23-8fa2-46b1-8cff-78652fa8e4ab 439
## 12 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
## 13 00c0941a-ac70-5bb3-9777-c66e02d56138 208
## 14 00d795da-4d30-54dd-948d-3d2b3aba7beb 166
## 15 00d973d4-9eb1-4e77-a02b-ce4d954f9624 2077
## 16 00e49a81-96ab-5f9b-b28c-4bc9e9e65544 2055
## 17 00e9cfa7-d2e1-46b8-b0ff-d7f1da925097 1149
## 18 00ef4e4f-a44b-467c-923c-3c001abdaa5d 3625
## 19 00f67d32-46c5-43f2-8356-01bdc988bfaf 108
## 20 0113b21f-c0d9-5e77-89b1-57a78f0f9a1e 1
## 21 0116cc2c-50b9-4284-a257-386262715003 0
## 22 01189d9f-1be2-5db6-a45d-941d0ef3b1bb 3870
## 23 01205bd4-0e26-4d7d-811b-f51190bd2fa9 1344
## 24 0124279c-6b9f-5dbd-84e1-b6307f104021 3553
## 25 013f0b1d-8e7c-438c-b561-289a629c2962 2579
## 26 01489f5a-650e-4e15-aacf-a8463cbed03c 376
## 27 0165ada0-f393-4ffb-8f80-6d87f830584d 49
## 28 0175f673-c1d7-4d3c-b799-32c4110a351c 386
## 29 0175f734-c477-59e4-8eb9-cf12c4a0d078 <NA>
## 30 01795bc6-adc0-40a2-9880-6c99b93c0c5a 2677
## days_to_last_follow_up days_to_death
## 1 <NA> <NA>
## 2 917.0 <NA>
## 3 <NA> <NA>
## 4 3853.0 <NA>
## 5 3259.0 <NA>
## 6 <NA> 1270
## 7 <NA> <NA>
## 8 832.0 <NA>
## 9 <NA> <NA>
## 10 1753.0 1753
## 11 <NA> <NA>
## 12 <NA> 17
## 13 <NA> <NA>
## 14 <NA> 166
## 15 <NA> <NA>
## 16 <NA> <NA>
## 17 1828.0 <NA>
## 18 <NA> <NA>
## 19 <NA> 230
## 20 <NA> 1
## 21 <NA> <NA>
## 22 <NA> <NA>
## 23 <NA> <NA>
## 24 <NA> <NA>
## 25 <NA> <NA>
## 26 425.0 425
## 27 877.0 <NA>
## 28 <NA> <NA>
## 29 533.0 533
## 30 <NA> <NA>
# because we have 3 columns of data, will write a loop to pull out 'time' (time to death or last followup) and 'status' (dead/alive at that time)
# this loop will first identify those that have died...
# ...then if the individual did not die, look in the days_to_last_follow_up column for a day value
# If there is no day value in the days_to_death or days_to_last_follow_up column, look in the days_to_follow_up column
M$time=NA
M$status=0
for(i in 1:nrow(M)) {
if(!is.na(M$days_to_death[i])) {
M$time[i]=M$days_to_death[i]
M$status[i]=1
} else {
if(!is.na(M$days_to_last_follow_up[i])) {M$time[i]=M$days_to_last_follow_up[i]} else {
if(!is.na(M$days_to_follow_up[i])) {M$time[i]=M$days_to_follow_up[i]}
}
}
}
M[1:30,]
## case_id days_to_follow_up
## 1 00318ddd-89bc-4a35-af79-5ab407fbd278 2976
## 2 005c990a-3b38-521a-b849-0bd12b23043e <NA>
## 3 00614f9b-1f5f-5037-bd29-bb0bb2c82426 3284
## 4 006a93d5-096f-5c34-9948-c934157fdc58 <NA>
## 5 0074779b-5919-54ff-89f5-a26f42d7d0d9 <NA>
## 6 0079b2ff-6c93-47b0-95cb-4cc9b8915e12 1270
## 7 007ddc83-31c6-59f1-a093-a063b4806f96 2649
## 8 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832
## 9 008ddf20-f7fb-4673-b2ed-b5d5212fbf09 59
## 10 0094e07c-1595-402e-9d38-68b9cac71e7b 494
## 11 00a65c23-8fa2-46b1-8cff-78652fa8e4ab 439
## 12 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
## 13 00c0941a-ac70-5bb3-9777-c66e02d56138 208
## 14 00d795da-4d30-54dd-948d-3d2b3aba7beb 166
## 15 00d973d4-9eb1-4e77-a02b-ce4d954f9624 2077
## 16 00e49a81-96ab-5f9b-b28c-4bc9e9e65544 2055
## 17 00e9cfa7-d2e1-46b8-b0ff-d7f1da925097 1149
## 18 00ef4e4f-a44b-467c-923c-3c001abdaa5d 3625
## 19 00f67d32-46c5-43f2-8356-01bdc988bfaf 108
## 20 0113b21f-c0d9-5e77-89b1-57a78f0f9a1e 1
## 21 0116cc2c-50b9-4284-a257-386262715003 0
## 22 01189d9f-1be2-5db6-a45d-941d0ef3b1bb 3870
## 23 01205bd4-0e26-4d7d-811b-f51190bd2fa9 1344
## 24 0124279c-6b9f-5dbd-84e1-b6307f104021 3553
## 25 013f0b1d-8e7c-438c-b561-289a629c2962 2579
## 26 01489f5a-650e-4e15-aacf-a8463cbed03c 376
## 27 0165ada0-f393-4ffb-8f80-6d87f830584d 49
## 28 0175f673-c1d7-4d3c-b799-32c4110a351c 386
## 29 0175f734-c477-59e4-8eb9-cf12c4a0d078 <NA>
## 30 01795bc6-adc0-40a2-9880-6c99b93c0c5a 2677
## days_to_last_follow_up days_to_death time status
## 1 <NA> <NA> 2976 0
## 2 917.0 <NA> 917.0 0
## 3 <NA> <NA> 3284 0
## 4 3853.0 <NA> 3853.0 0
## 5 3259.0 <NA> 3259.0 0
## 6 <NA> 1270 1270 1
## 7 <NA> <NA> 2649 0
## 8 832.0 <NA> 832.0 0
## 9 <NA> <NA> 59 0
## 10 1753.0 1753 1753 1
## 11 <NA> <NA> 439 0
## 12 <NA> 17 17 1
## 13 <NA> <NA> 208 0
## 14 <NA> 166 166 1
## 15 <NA> <NA> 2077 0
## 16 <NA> <NA> 2055 0
## 17 1828.0 <NA> 1828.0 0
## 18 <NA> <NA> 3625 0
## 19 <NA> 230 230 1
## 20 <NA> 1 1 1
## 21 <NA> <NA> 0 0
## 22 <NA> <NA> 3870 0
## 23 <NA> <NA> 1344 0
## 24 <NA> <NA> 3553 0
## 25 <NA> <NA> 2579 0
## 26 425.0 425 425 1
## 27 877.0 <NA> 877.0 0
## 28 <NA> <NA> 386 0
## 29 533.0 533 533 1
## 30 <NA> <NA> 2677 0
length(which(is.na(M$time))) # checking if there are any NA values in the time column that we just coded
## [1] 0
NOTCH1 = read.delim("~/Desktop/GDCdata/NOTCH1_mutation/clinical.cohort.2025-03-20/clinical.tsv", sep = "\t", header = TRUE, fill = TRUE)
NOTCH1$mutant=1 # indicator
NOTCH1=NOTCH1[,c("case_id","mutant")] # too many columns, lets just pull out the Id and NOTCH1 indicator
NOTCH1=NOTCH1[c(which(!duplicated(NOTCH1$case_id))),]
dim(NOTCH1) # n=181, same as whats showing on the GDC
## [1] 181 2
# now merge this in with the follow-up data
dim(M); M=merge(M,NOTCH1,by="case_id",all=TRUE); dim(M)
## [1] 6501 6
## [1] 6505 7
# there were slightly more (n=4) after the merge - this means we have 4 individuals with missing followup data
M=M[c(which(!is.na(M$status))),] # remove NA values that have popped up in the merge
M$mutant[c(which(is.na(M$mutant)))]=0
table(M$mutant)
##
## 0 1
## 6324 177
M$time=as.numeric(M$time)
M[1:30,]
## case_id days_to_follow_up
## 1 00318ddd-89bc-4a35-af79-5ab407fbd278 2976
## 2 005c990a-3b38-521a-b849-0bd12b23043e <NA>
## 3 00614f9b-1f5f-5037-bd29-bb0bb2c82426 3284
## 4 006a93d5-096f-5c34-9948-c934157fdc58 <NA>
## 5 0074779b-5919-54ff-89f5-a26f42d7d0d9 <NA>
## 6 0079b2ff-6c93-47b0-95cb-4cc9b8915e12 1270
## 7 007ddc83-31c6-59f1-a093-a063b4806f96 2649
## 8 0084e8b6-57fc-48b6-aa77-fec6e45161d2 832
## 9 008ddf20-f7fb-4673-b2ed-b5d5212fbf09 59
## 10 0094e07c-1595-402e-9d38-68b9cac71e7b 494
## 11 00a65c23-8fa2-46b1-8cff-78652fa8e4ab 439
## 12 00b9bfd0-95c0-59c6-afa4-44d13563a7e6 17
## 13 00c0941a-ac70-5bb3-9777-c66e02d56138 208
## 14 00d795da-4d30-54dd-948d-3d2b3aba7beb 166
## 15 00d973d4-9eb1-4e77-a02b-ce4d954f9624 2077
## 16 00e49a81-96ab-5f9b-b28c-4bc9e9e65544 2055
## 17 00e9cfa7-d2e1-46b8-b0ff-d7f1da925097 1149
## 18 00ef4e4f-a44b-467c-923c-3c001abdaa5d 3625
## 19 00f67d32-46c5-43f2-8356-01bdc988bfaf 108
## 20 0113b21f-c0d9-5e77-89b1-57a78f0f9a1e 1
## 21 0116cc2c-50b9-4284-a257-386262715003 0
## 22 01189d9f-1be2-5db6-a45d-941d0ef3b1bb 3870
## 23 01205bd4-0e26-4d7d-811b-f51190bd2fa9 1344
## 24 0124279c-6b9f-5dbd-84e1-b6307f104021 3553
## 25 013f0b1d-8e7c-438c-b561-289a629c2962 2579
## 26 01489f5a-650e-4e15-aacf-a8463cbed03c 376
## 27 0165ada0-f393-4ffb-8f80-6d87f830584d 49
## 28 0175f673-c1d7-4d3c-b799-32c4110a351c 386
## 29 0175f734-c477-59e4-8eb9-cf12c4a0d078 <NA>
## 30 01795bc6-adc0-40a2-9880-6c99b93c0c5a 2677
## days_to_last_follow_up days_to_death time status mutant
## 1 <NA> <NA> 2976 0 0
## 2 917.0 <NA> 917 0 0
## 3 <NA> <NA> 3284 0 0
## 4 3853.0 <NA> 3853 0 0
## 5 3259.0 <NA> 3259 0 0
## 6 <NA> 1270 1270 1 0
## 7 <NA> <NA> 2649 0 0
## 8 832.0 <NA> 832 0 0
## 9 <NA> <NA> 59 0 0
## 10 1753.0 1753 1753 1 0
## 11 <NA> <NA> 439 0 0
## 12 <NA> 17 17 1 0
## 13 <NA> <NA> 208 0 0
## 14 <NA> 166 166 1 0
## 15 <NA> <NA> 2077 0 0
## 16 <NA> <NA> 2055 0 0
## 17 1828.0 <NA> 1828 0 0
## 18 <NA> <NA> 3625 0 0
## 19 <NA> 230 230 1 0
## 20 <NA> 1 1 1 0
## 21 <NA> <NA> 0 0 0
## 22 <NA> <NA> 3870 0 0
## 23 <NA> <NA> 1344 0 0
## 24 <NA> <NA> 3553 0 0
## 25 <NA> <NA> 2579 0 0
## 26 425.0 425 425 1 0
## 27 877.0 <NA> 877 0 0
## 28 <NA> <NA> 386 0 0
## 29 533.0 533 533 1 0
## 30 <NA> <NA> 2677 0 0
library(ggsurvfit)
#dev.new(height=4, width=4)
survfit2(Surv(time, status) ~ mutant, data = M) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
)
# this looks slightly different than online, notice the followup time...
# it is roughly 12 years in the GDC portal and 6500/365=~18 years with our extracted data!
# So we're including individuals who likely have incomplete follow-up time, which likely came from including the days_to_follow_up, when we should have just included the days_to_last_follow_up variable
# this will mean we will lose some individuals who don't have data for the days_to_last_follow_up variable
# dim(M) # reduces from n=6501...
M2=M[c(which( !is.na(M$days_to_death) | !is.na(M$days_to_last_follow_up) )),]
# dim(M2) # to n=2510
survfit2(Surv(time, status) ~ mutant, data = M2) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
)
# this now looks much closer to what we see online in terms of follow-up time and differences between individuals with and without a NOTCH1 mutation
# we don't expect it to be exact, as some GDC data is restricted and will not be downloaded.
Here I use the TCGAbiolinks R package to download the clinical/follow-up data - seems to be the only one available for extracting this data
I also end up using the GenomicDataCommons R package to download gene, as the TCGAbiolinks did not seem well suited for this
TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data
GenomicDataCommons: NIH / NCI Genomic Data Commons Access
In this example, let’s extract TCGA-specific data, and look individuals with a TP53 mutation
extract the follow-up data, days to death or days to last follow up
# I've already run this code chunk - the GDCquery() may take ~5-10 minutes to query so will not rerun at the tutorial today
# instead I've included some notes below, and will read in a pre-processed file below
library(TCGAbiolinks)
QUERY_clin <- GDCquery(
project = c("TCGA-SKCM","TCGA-BRCA","TCGA-KIRC","TCGA-LUAD","TCGA-LUSC"), data.category = "Clinical",
data.type = "Clinical Supplement",
data.format = "BCR Biotab",
#case_ids = Meta_ALL$case_id
) # this takes a few minutes to query
GDCdownload(QUERY_clin) # this may also take a few minutes if the first time downloaded
Clinical_Meta <- GDCprepare(QUERY_clin)
head(Clinical_Meta) # this will show some of the extracted data for the SKCM dataset
$clinical_patient_skcm
# A tibble: 472 × 65
bcr_patient_uuid bcr_patient_barcode form_completion_date prospective_collection retrospective_collec…¹ birth_days_to gender height_cm_at_diagnosis weight_kg_at_diagnosis race ethnicity history_other_malign…²
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode form_completion_date tissue_prospective_co… tissue_retrospective_… days_to_birth gender height weight race ethnicity other_dx
2 CDE_ID: CDE_ID:2003301 CDE_ID: CDE_ID:3088492 CDE_ID:3088528 CDE_ID:30082… CDE_I… CDE_ID:649 CDE_ID:651 CDE_… CDE_ID:2… CDE_ID:3382736
3 5564E6A7-2195-4B… TCGA-3N-A9WB 2014-5-29 YES NO -26176 MALE 175 78 WHITE NOT HISP… No
4 551E071A-C290-4B… TCGA-3N-A9WC 2014-5-29 YES NO -30286 MALE 183 68 WHITE NOT HISP… No
5 A29A20E3-5C2C-4F… TCGA-3N-A9WD 2014-5-29 YES NO -30163 MALE 183 116 WHITE NOT HISP… No
6 3DD5A206-D7F3-42… TCGA-BF-A1PU 2013-4-25 YES NO -17025 FEMALE 160 58 WHITE [Unknown] No
7 EFF78AF6-0F68-49… TCGA-BF-A1PV 2013-4-25 YES NO -27124 FEMALE 160 70 WHITE [Unknown] No
8 197AC33E-D5DF-40… TCGA-BF-A1PX 2013-4-25 YES NO -20626 MALE 175 78 WHITE NOT HISP… No
9 455F982C-A067-46… TCGA-BF-A1PZ 2013-4-25 YES NO -26240 FEMALE 163 56 WHITE NOT HISP… No
10 633DDC04-C424-42… TCGA-BF-A1Q0 2013-4-25 YES NO -29380 MALE 173 69 WHITE NOT HISP… No
# ℹ 462 more rows
# ℹ abbreviated names: ¹retrospective_collection, ²history_other_malignancy
# ℹ 53 more variables: history_neoadjuvant_treatment...13 <chr>, tumor_status <chr>, vital_status <chr>, last_contact_days_to <chr>, death_days_to <chr>, primary_melanoma_known_dx <chr>,
# primary_multiple_at_dx <chr>, primary_at_dx_count <chr>, primary_location <chr>, breslow_thickness_at_diagnosis <chr>, clark_level_at_diagnosis <chr>, primary_melanoma_tumor_ulceration <chr>,
# primary_melanoma_mitotic_rate <chr>, radiation_therapy_to_primary <chr>, initial_pathologic_dx_year <chr>, age_at_diagnosis <chr>, ajcc_staging_edition <chr>, ajcc_tumor_pathologic_pt <chr>,
# ajcc_nodes_pathologic_pn <chr>, ajcc_metastasis_pathologic_pm <chr>, ldh_level <chr>, ajcc_pathologic_tumor_stage <chr>, submitted_tumor_dx_days_to <chr>, submitted_tumor_site <chr>,
# metastatic_tumor_site <chr>, primary_melanoma_skin_type <chr>, prior_radiation_therapy <chr>, history_neoadjuvant_treatment...40 <chr>, history_neoadjuvant_tx_type <chr>, …
# ℹ Use `print(n = ...)` to see more rows
$clinical_radiation_skcm
# A tibble: 149 × 18
bcr_patient_uuid bcr_patient_barcode bcr_radiation_barcode bcr_radiation_uuid form_completion_date radiation_therapy_type radiation_therapy_site radiation_total_dose radiation_adjuvant_u…¹
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode bcr_radiation_barcode bcr_radiation_uuid form_completion_date radiation_type anatomic_treatment_si… radiation_dosage units
2 CDE_ID: CDE_ID:2003301 CDE_ID: CDE_ID: CDE_ID: CDE_ID:2842944 CDE_ID:2793522 CDE_ID:2721441 CDE_ID:61446
3 A29A20E3-5C2C-4F37-B93E-AE9EBC46EC53 TCGA-3N-A9WD TCGA-3N-A9WD-R60044 483202B0-173B-42D1-… 2014-5-29 External Distant site 4800 cGy
4 238111F9-A08B-49B4-9035-CC43026330E7 TCGA-D3-A1Q5 TCGA-D3-A1Q5-R32604 775FC64B-C831-4FA1-… 2012-10-2 External Regional site 30 Gy
5 DF6303CD-FFB5-422F-9865-B09029A6D54F TCGA-D3-A1Q7 TCGA-D3-A1Q7-R35145 B40C4808-DC62-47C3-… 2012-10-10 External Regional site 3000 cGy
6 7B7A49B5-5BE8-4966-8F3A-E6E01E694F86 TCGA-D3-A1Q8 TCGA-D3-A1Q8-R35146 1A047F7E-7143-4E7B-… 2012-10-10 External Regional site 3000 cGy
7 19322032-C144-4962-B886-37606DAD33B7 TCGA-D3-A2J7 TCGA-D3-A2J7-R35212 F0214A29-5353-4E57-… 2012-10-10 External Regional site 3000 cGy
8 E9E615A1-9323-49D3-904C-FCF4887FEA61 TCGA-D3-A2J9 TCGA-D3-A2J9-R35214 17926EC9-C3C7-49B9-… 2012-10-10 External Regional site 30 Gy
9 33D7CCCB-6349-4337-8C5B-6E3E8B3C7AF1 TCGA-D3-A2JA TCGA-D3-A2JA-R35215 9683BAE9-ED79-4C3D-… 2012-10-10 External Local Recurrence 35 Gy
10 3A27966B-2C4C-402F-B71B-DF7925C36A7C TCGA-D3-A2JH TCGA-D3-A2JH-R34873 E9C846D6-0486-42BD-… 2012-10-10 External Regional site 3000 cGy
# ℹ 139 more rows
# ℹ abbreviated name: ¹radiation_adjuvant_units
# ℹ 9 more variables: radiation_adjuvant_fractions_total <chr>, radiation_therapy_started_days_to <chr>, radiation_therapy_ongoing_indicator <chr>, radiation_therapy_ended_days_to <chr>,
# treatment_best_response <chr>, course_number <chr>, radiation_type_other <chr>, therapy_regimen <chr>, therapy_regimen_other <chr>
# ℹ Use `print(n = ...)` to see more rows
$clinical_follow_up_v2.0_skcm
# A tibble: 391 × 29
bcr_patient_uuid bcr_patient_barcode bcr_followup_barcode bcr_followup_uuid form_completion_date followup_lost_to radiation_treatment_…¹ pharmaceutical_tx_ad…² tumor_status vital_status last_contact_days_to
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode bcr_followup_barcode bcr_followup_uuid form_completion_date lost_follow_up radiation_therapy postoperative_rx_tx person_neop… vital_status days_to_last_follow…
2 CDE_ID: CDE_ID:2003301 CDE_ID: CDE_ID: CDE_ID: CDE_ID:61333 CDE_ID:2005312 CDE_ID:3397567 CDE_ID:2759… CDE_ID:5 CDE_ID:3008273
3 551E071A-C290-4B48-… TCGA-3N-A9WC TCGA-3N-A9WC-F67104 BA36CE25-3689-4E… 2014-11-4 NO NO NO WITH TUMOR Alive 2022
4 3DD5A206-D7F3-42F1-… TCGA-BF-A1PU TCGA-BF-A1PU-F68906 3FC09348-32EC-48… 2014-12-23 YES [Not Available] [Not Available] [Not Availa… [Not Availa… [Not Available]
5 EFF78AF6-0F68-49B9-… TCGA-BF-A1PV TCGA-BF-A1PV-F68907 C174F01D-D88D-49… 2014-12-23 YES [Not Available] [Not Available] [Not Availa… [Not Availa… [Not Available]
6 455F982C-A067-46AC-… TCGA-BF-A1PZ TCGA-BF-A1PZ-F68909 4B1D5ACA-C24D-4E… 2014-12-23 NO NO NO TUMOR FREE Alive 853
7 633DDC04-C424-42C8-… TCGA-BF-A1Q0 TCGA-BF-A1Q0-F68910 48394337-6B3D-46… 2014-12-23 NO NO NO TUMOR FREE Alive 831
8 62393DAB-AB41-4785-… TCGA-BF-A3DJ TCGA-BF-A3DJ-F65978 6863AADE-07FC-4A… 2014-12-23 YES [Not Available] [Not Available] [Not Availa… [Not Availa… [Not Available]
9 E9F0F030-D9AA-47C8-… TCGA-BF-A3DL TCGA-BF-A3DL-F68911 196EB9F7-39D2-40… 2014-12-23 NO NO NO TUMOR FREE Alive 769
10 1CA2EB9F-9496-450F-… TCGA-BF-A3DM TCGA-BF-A3DM-F68913 AE5E936F-7FDB-4C… 2014-12-23 NO NO NO TUMOR FREE Alive 601
# ℹ 381 more rows
# ℹ abbreviated names: ¹radiation_treatment_adjuvant, ²pharmaceutical_tx_adjuvant
# ℹ 18 more variables: death_days_to <chr>, subsequent_known_primaries <chr>, new_tumor_event_dx_indicator <chr>, new_tumor_event_dx_days_to <chr>, new_tumor_event_surgery <chr>,
# new_tumor_event_surgery_days_to <chr>, new_tumor_event_radiation_tx <chr>, new_tumor_event_pharmaceutical_tx <chr>, new_tumor_event_type <chr>, new_tumor_event_met_site <chr>,
# new_tumor_event_met_site_other <chr>, new_tumor_event_melanoma_location <chr>, melanoma_primary_count <chr>, new_tumor_event_melanoma_site <chr>, new_tumor_event_histo_type <chr>, new_tumor_event_site <chr>,
# followup_reason <chr>, nte_anatomic_site_count <chr>
# ℹ Use `print(n = ...)` to see more rows
$clinical_omf_v4.0_skcm
# A tibble: 32 × 30
bcr_patient_uuid bcr_patient_barcode bcr_omf_barcode bcr_omf_uuid form_completion_date malignancy_type other_malignancy_dx_…¹ surgery_indicator other_malignancy_sur…² other_malignancy_sur…³
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode bcr_omf_barcode bcr_omf_uuid form_completion_date malignancy_type days_to_other_maligna… surgery_indicator surgery_type days_to_surgical_rese…
2 CDE_ID: CDE_ID:2003301 CDE_ID: CDE_ID: CDE_ID: CDE_ID:3182890 CDE_ID:3462301 CDE_ID:3186538 CDE_ID:3462300 CDE_ID:3462302
3 2EC330F8-96E4-4270-AADC-563BB72F4B2C TCGA-D3-A2JE TCGA-D3-A2JE-O1… A6D7215E-8F… 2011-6-27 Synchronous Ma… [Not Available] [Not Available] [Not Available] 188
4 C58638ED-A38C-4494-A41E-6EE5283A7A59 TCGA-D3-A2JF TCGA-D3-A2JF-O1… F57B990D-2D… 2011-6-27 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
5 C58638ED-A38C-4494-A41E-6EE5283A7A59 TCGA-D3-A2JF TCGA-D3-A2JF-O1… F84B2888-5F… 2011-6-27 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
6 16182443-12A1-4B0F-A6AB-33C2942E2991 TCGA-D3-A3C8 TCGA-D3-A3C8-O1… EA5301AB-2D… 2011-9-11 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
7 16182443-12A1-4B0F-A6AB-33C2942E2991 TCGA-D3-A3C8 TCGA-D3-A3C8-O1… 28E0743F-36… 2011-9-11 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
8 0153F141-625E-4623-9F8A-296678002C63 TCGA-D3-A3ML TCGA-D3-A3ML-O1… C4E381E0-58… 2011-12-8 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
9 0153F141-625E-4623-9F8A-296678002C63 TCGA-D3-A3ML TCGA-D3-A3ML-O1… 75353761-4D… 2011-12-8 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
10 0153F141-625E-4623-9F8A-296678002C63 TCGA-D3-A3ML TCGA-D3-A3ML-O1… 33ED7364-ED… 2011-12-8 Prior Malignan… [Not Available] [Not Available] [Not Available] [Not Available]
# ℹ 22 more rows
# ℹ abbreviated names: ¹other_malignancy_dx_days_to, ²other_malignancy_surgery_type, ³other_malignancy_surgery_days_to
# ℹ 20 more variables: pharmaceutical_therapy_indicator <chr>, pharmaceutical_therapy_extent <chr>, pharmaceutical_therapy_drug_name <chr>, pharmaceutical_tx_started_days_to <chr>,
# radiation_therapy_indicator <chr>, radiation_therapy_extent <chr>, history_rt_tx_to_site_of_tcga_tumor <chr>, radiation_therapy_started_days_to <chr>, ajcc_staging_edition <chr>,
# ajcc_tumor_pathologic_pt <chr>, ajcc_nodes_pathologic_pn <chr>, ajcc_metastasis_pathologic_pm <chr>, ajcc_pathologic_tumor_stage <chr>, clinical_stage <chr>, other_malignancy_anatomic_site <chr>,
# other_malignancy_anatomic_site_text <chr>, other_malignancy_histological_type <chr>, other_malignancy_histological_type_text <chr>, other_malignancy_laterality <chr>, stage_other <chr>
# ℹ Use `print(n = ...)` to see more rows
$clinical_drug_skcm
# A tibble: 267 × 28
bcr_patient_uuid bcr_patient_barcode bcr_drug_barcode bcr_drug_uuid form_completion_date pharmaceutical_thera…¹ clinical_trial_drug_…² pharmaceutical_thera…³ pharmaceutical_tx_st…⁴ pharmaceutical_tx_on…⁵
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode bcr_drug_barcode bcr_drug_uuid form_completion_date drug_name clinical_trail_drug_c… therapy_type days_to_drug_therapy_… therapy_ongoing
2 CDE_ID: CDE_ID:2003301 CDE_ID: CDE_ID: CDE_ID: CDE_ID:2975232 CDE_ID:3378323 CDE_ID:2793530 CDE_ID:3392465 CDE_ID:3103479
3 8C54FCFD-999F-43C5-B31… TCGA-D3-A1Q1 TCGA-D3-A1Q1-D3… A837F724-544… 2012-10-2 Cyclophosphamide [Not Available] Chemotherapy 247 NO
4 8C54FCFD-999F-43C5-B31… TCGA-D3-A1Q1 TCGA-D3-A1Q1-D4… 18E0B29D-583… 2013-2-22 [Not Available] Melanoma-derived help… Immunotherapy 247 NO
5 8C54FCFD-999F-43C5-B31… TCGA-D3-A1Q1 TCGA-D3-A1Q1-D4… DFD48E5E-C36… 2013-2-22 [Not Available] Class I MHC restricte… Vaccine 247 NO
6 29476D6A-0D06-4A18-891… TCGA-D3-A1Q3 TCGA-D3-A1Q3-D3… 762FC34B-A29… 2013-1-7 Cisplatin [Not Available] Chemotherapy 362 NO
7 1E05DA2E-7DD7-4C98-9E7… TCGA-D3-A1Q6 TCGA-D3-A1Q6-D3… 8036ED31-A3E… 2012-10-2 Interferon [Not Available] Immunotherapy 111 NO
8 798074D3-0B84-4DD5-AA8… TCGA-D3-A1Q9 TCGA-D3-A1Q9-D3… 15D5B49C-2D3… 2012-10-10 Lupron [Not Available] Hormone Therapy 356 NO
9 798074D3-0B84-4DD5-AA8… TCGA-D3-A1Q9 TCGA-D3-A1Q9-D3… A441C4EF-216… 2012-10-10 GP-100 [Not Available] Vaccine 356 NO
10 FBEE9FE7-8BED-41A6-BCF… TCGA-D3-A1QA TCGA-D3-A1QA-D3… 22FADF39-879… 2012-10-2 recMAGE- A3 [Not Available] Vaccine 1485 NO
# ℹ 257 more rows
# ℹ abbreviated names: ¹pharmaceutical_therapy_drug_name, ²clinical_trial_drug_classification, ³pharmaceutical_therapy_type, ⁴pharmaceutical_tx_started_days_to, ⁵pharmaceutical_tx_ongoing_indicator
# ℹ 18 more variables: pharmaceutical_tx_ended_days_to <chr>, treatment_best_response <chr>, days_to_stem_cell_transplantation <chr>, pharm_regimen <chr>, pharm_regimen_other <chr>,
# pharma_adjuvant_cycles_count <chr>, pharma_type_other <chr>, pharmaceutical_tx_dose_units <chr>, pharmaceutical_tx_total_dose_units <chr>, prescribed_dose <chr>, regimen_number <chr>,
# route_of_administration <chr>, stem_cell_transplantation <chr>, stem_cell_transplantation_type <chr>, therapy_regimen <chr>, therapy_regimen_other <chr>, total_dose <chr>, tx_on_clinical_trial <chr>
# ℹ Use `print(n = ...)` to see more rows
$clinical_nte_skcm
# A tibble: 761 × 15
bcr_patient_uuid bcr_patient_barcode new_tumor_event_dx_d…¹ new_tumor_event_surg…² new_tumor_event_surg…³ new_tumor_event_radi…⁴ new_tumor_event_phar…⁵ new_tumor_event_type new_tumor_event_met_…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 bcr_patient_uuid bcr_patient_barcode days_to_new_tumor_eve… new_tumor_event_addit… days_to_new_tumor_eve… additional_radiation_… additional_pharmaceut… new_neoplasm_event_… new_tumor_metastasis_…
2 CDE_ID: CDE_ID:2003301 CDE_ID:3392464 CDE_ID:3427611 CDE_ID:3008335 CDE_ID:3427615 CDE_ID:3427616 CDE_ID:3119721 CDE_ID:3430936
3 5564E6A7-2195-4B0D-994E-B0617B… TCGA-3N-A9WB 426 YES 457 NO NO Locoregional Recurr… Other, Specify
4 5564E6A7-2195-4B0D-994E-B0617B… TCGA-3N-A9WB 457 NO [Not Available] NO NO Distant Metastasis Lung
5 5564E6A7-2195-4B0D-994E-B0617B… TCGA-3N-A9WB 457 NO [Not Available] NO NO Distant Metastasis Bone
6 5564E6A7-2195-4B0D-994E-B0617B… TCGA-3N-A9WB 487 NO [Not Available] NO NO Distant Metastasis Liver
7 5564E6A7-2195-4B0D-994E-B0617B… TCGA-3N-A9WB 487 NO [Not Available] YES NO Distant Metastasis Brain
8 551E071A-C290-4B48-9000-F64C2A… TCGA-3N-A9WC 1705 YES 1705 NO NO Distant Metastasis Lung
9 A29A20E3-5C2C-4F37-B93E-AE9EBC… TCGA-3N-A9WD 306 NO [Not Available] NO NO Distant Metastasis Brain
10 3DD5A206-D7F3-42F1-B9CC-4B31C7… TCGA-BF-A1PU 484 YES 484 NO YES Distant Metastasis Lung
# ℹ 751 more rows
# ℹ abbreviated names: ¹new_tumor_event_dx_days_to, ²new_tumor_event_surgery, ³new_tumor_event_surgery_days_to, ⁴new_tumor_event_radiation_tx, ⁵new_tumor_event_pharmaceutical_tx, ⁶new_tumor_event_met_site
# ℹ 6 more variables: new_tumor_event_met_site_other <chr>, new_tumor_event_melanoma_location <chr>, new_tumor_event_melanoma_site <chr>, new_tumor_event_histo_type <chr>, new_tumor_event_site <chr>,
# nte_anatomic_site_count <chr>
# ℹ Use `print(n = ...)` to see more rows
# we can use ls() to see all extracted datasets
ls(Clinical_Meta)
[1] "clinical_drug_brca" "clinical_drug_kirc" "clinical_drug_luad" "clinical_drug_lusc" "clinical_drug_skcm"
[6] "clinical_follow_up_v1.0_kirc" "clinical_follow_up_v1.0_luad" "clinical_follow_up_v1.0_lusc" "clinical_follow_up_v1.5_brca" "clinical_follow_up_v2.0_skcm"
[11] "clinical_follow_up_v2.1_brca" "clinical_follow_up_v4.0_brca" "clinical_follow_up_v4.0_nte_brca" "clinical_nte_brca" "clinical_nte_kirc"
[16] "clinical_nte_luad" "clinical_nte_lusc" "clinical_nte_skcm" "clinical_omf_v4.0_brca" "clinical_omf_v4.0_kirc"
[21] "clinical_omf_v4.0_luad" "clinical_omf_v4.0_lusc" "clinical_omf_v4.0_skcm" "clinical_patient_brca" "clinical_patient_kirc"
[26] "clinical_patient_luad" "clinical_patient_lusc" "clinical_patient_skcm" "clinical_radiation_brca" "clinical_radiation_kirc"
[31] "clinical_radiation_luad" "clinical_radiation_lusc" "clinical_radiation_skcm"
# Here I'm pulling out the 'clinical_patient..' data for each project - this has the days to death or last follow-up variables we need
IN=as.data.frame(Clinical_Meta$clinical_patient_skcm)
IN2=as.data.frame(Clinical_Meta$clinical_patient_brca)
IN3=as.data.frame(Clinical_Meta$clinical_patient_kirc)
IN4=as.data.frame(Clinical_Meta$clinical_patient_luad)
IN5=as.data.frame(Clinical_Meta$clinical_patient_lusc)
# note that the different projects (e.g. SKCM, BRCA) may have slightly different variables
> dim(IN)
[1] 472 65 -> 65 columns or variables
> dim(IN2)
[1] 1099 112 -> columns or variables
> dim(IN3)
[1] 539 61
> dim(IN4)
[1] 524 123
> dim(IN5)
[1] 506 107
# some of these variables might be interesting to investigate as covariates in out survival analysis (treatment type, tumour site etc)
# For this tutorial, we will just extract days to and gender variables
IN=IN[,c("bcr_patient_uuid","last_contact_days_to","death_days_to","gender")]; IN=IN[3:nrow(IN),] # remove first two rows
IN2=IN2[,c("bcr_patient_uuid","last_contact_days_to","death_days_to","gender")]; IN2=IN2[3:nrow(IN2),] # remove first two rows
IN3=IN3[,c("bcr_patient_uuid","last_contact_days_to","death_days_to","gender")]; IN3=IN3[3:nrow(IN3),] # remove first two rows
IN4=IN4[,c("bcr_patient_uuid","last_contact_days_to","death_days_to","gender")]; IN4=IN4[3:nrow(IN4),] # remove first two rows
IN5=IN5[,c("bcr_patient_uuid","last_contact_days_to","death_days_to","gender")]; IN5=IN5[3:nrow(IN5),] # remove first two rows
IN=rbind(IN,IN2,IN3,IN4,IN5)
write.csv(IN, "~/Desktop/GDCdata/R_query_extracted/GDC_days_data.csv",row.names=FALSE)
IN=read.csv("~/Desktop/GDCdata/R_query_extracted/GDC_days_data.csv")
# now need to recode data for survival analysis
# create new time variable, which is a merge between death_days_to and last_contact_days_to
IN$time=IN$death_days_to
IN$time[c(which(IN$time == "[Not Applicable]"))]=IN$last_contact_days_to[c(which(IN$time == "[Not Applicable]"))]
# create new binary status variable
#IN$status=as.numeric(IN$vital_status == "Dead")
IN$status=as.numeric(IN$death_days_to != "[Not Applicable]")
table(IN$status)
##
## 0 1
## 2420 710
# remove missing data
IN=IN[c(which(IN$time != "[Completed]")),]
IN=IN[c(which(IN$time != "[Not Available]")),]
# make sure time variable is numeric
IN$time=as.numeric(IN$time)
## Warning: NAs introduced by coercion
IN=IN[c(which(!is.na(IN$time))),]
# also need to convert patient Id's to lower case, to match the lower case of the patient Id's when using the 'export cohort' function for specific mutations
IN$bcr_patient_uuid=tolower(IN$bcr_patient_uuid)
#dim(IN) # 3090
head(IN); dim(IN)
## bcr_patient_uuid last_contact_days_to death_days_to
## 1 5564e6a7-2195-4b0d-994e-b0617b58e889 [Not Available] 518
## 2 551e071a-c290-4b48-9000-f64c2a44dfb7 1856 [Not Applicable]
## 3 a29a20e3-5c2c-4f37-b93e-ae9ebc46ec53 [Not Available] 395
## 4 3dd5a206-d7f3-42f1-b9cc-4b31c76d495d 387 [Not Applicable]
## 5 eff78af6-0f68-49b9-866b-0d511606f2b1 14 [Not Applicable]
## 6 197ac33e-d5df-40de-92f2-6ef7fe2ad6dd [Not Available] 282
## gender time status
## 1 MALE 518 1
## 2 MALE 1856 0
## 3 MALE 395 1
## 4 FEMALE 387 0
## 5 FEMALE 14 0
## 6 MALE 282 1
## [1] 3090 6
# -----------------------------------------------------
# TCGAbiolinks: Searching, downloading and visualizing mutation files
# see https://bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/mutation.html#Mutation_data_MC3_file
#QUERY <- GDCquery(
# project = c("TCGA-SKCM","TCGA-BRCA","TCGA-KIRC","TCGA-LUAD","TCGA-LUSC"),
# data.category = "Simple Nucleotide Variation",
# access = "open",
# data.type = "Masked Somatic Mutation",
# workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#)
#GDCdownload(QUERY)
#maf <- GDCprepare(QUERY)
# multiple datasets - this download kept failing
# try a single dataset
#QUERY <- GDCquery(
# project = "TCGA-SKCM",
# data.category = "Simple Nucleotide Variation",
# access = "open",
# data.type = "Masked Somatic Mutation",
# workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#)
#GDCdownload(QUERY)
#maf <- GDCprepare(QUERY)
# this worked, however this function downloads massive files that don't seem specific to mutations
# -----------------------------------------------------
# Alternative pathway to extract gene/mutation data - GenomicDataCommons package
# Working with simple somatic mutations
# see https://bioconductor.org/packages/release/bioc/vignettes/GenomicDataCommons/inst/doc/somatic_mutations.html
# this webpage, while it works, is not great in that it has no notes on each step
library(GenomicDataCommons)
library(tibble)
grep_fields('genes', 'symbol')
## [1] "symbol"
tp53 = genes() |>
GenomicDataCommons::filter(symbol=='TP53') |>
results(size=10000) |>
as_tibble()
ssms() |>
GenomicDataCommons::filter(
chromosome==paste0('chr',tp53$gene_chromosome[1]) &
start_position > tp53$gene_start[1] &
end_position < tp53$gene_end[1]) |>
GenomicDataCommons::count()
## [1] 1357
ssms() |>
GenomicDataCommons::filter(
consequence.transcript.gene.symbol %in% c('TP53')) |>
GenomicDataCommons::count()
## [1] 1355
library(VariantAnnotation)
vars = ssms() |>
GenomicDataCommons::filter(
consequence.transcript.gene.symbol %in% c('TP53')) |>
GenomicDataCommons::results_all() |>
as_tibble()
vr = VRanges(seqnames = vars$chromosome,
ranges = IRanges(start=vars$start_position, width=1),
ref = vars$reference_allele,
alt = vars$tumor_allele)
ssm_occurrences() |>
GenomicDataCommons::filter(
ssm.consequence.transcript.gene.symbol %in% c('TP53')) |>
GenomicDataCommons::count()
## [1] 5411
var_samples = ssm_occurrences() |>
GenomicDataCommons::filter(
ssm.consequence.transcript.gene.symbol %in% c('TP53')) |>
GenomicDataCommons::expand(c('case', 'ssm', 'case.project')) |>
GenomicDataCommons::results_all() |>
as_tibble()
# lets have a look at the data that was extracted
ls(var_samples)
## [1] "case" "id" "ssm"
## [4] "ssm_occurrence_id"
head(var_samples$case)
## primary_site
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Breast
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Esophagus
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Bronchus and lung
## 89389d68-618c-5180-b200-e13c8c8b958c Ovary
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Other and ill-defined sites in lip, oral cavity and pharynx
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Bronchus and lung
## disease_type
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Ductal and Lobular Neoplasms
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Adenomas and Adenocarcinomas
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Squamous Cell Neoplasms
## 89389d68-618c-5180-b200-e13c8c8b958c Cystic, Mucinous and Serous Neoplasms
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Squamous Cell Neoplasms
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Squamous Cell Neoplasms
## available_variation_data
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 cnv, ssm
## acfa444f-c9e7-50ff-a0c5-2799b734f425 cnv, ssm
## 9cdeb222-2715-5877-af0d-74dcf1243fdf cnv, ssm
## 89389d68-618c-5180-b200-e13c8c8b958c cnv, ssm
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 cnv, ssm
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 cnv, ssm
## case_id
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 db4bc6aa-2e7d-4bcb-8519-a455f624d33b
## acfa444f-c9e7-50ff-a0c5-2799b734f425 cd2f37a3-1912-4e64-89c5-a92a070efd5b
## 9cdeb222-2715-5877-af0d-74dcf1243fdf f18c47e1-516f-43f7-8d31-4e5012bd22f0
## 89389d68-618c-5180-b200-e13c8c8b958c c9226204-cb61-40b8-8a94-a7e71c14ea3a
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 3f2e0c8a-6d5f-4763-be60-5a70994d31a1
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 418a004e-d5a9-461b-8764-4b581b6b8223
## project.primary_site project.disease_type
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Breast Epitheli....
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Esophagu.... Squamous....
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Bronchus.... Adenomas....
## 89389d68-618c-5180-b200-e13c8c8b958c Ovary, R.... Cystic, ....
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Tonsil, .... Squamous....
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Bronchus.... Adenomas....
## project.project_id
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 TCGA-BRCA
## acfa444f-c9e7-50ff-a0c5-2799b734f425 TCGA-ESCA
## 9cdeb222-2715-5877-af0d-74dcf1243fdf TCGA-LUSC
## 89389d68-618c-5180-b200-e13c8c8b958c TCGA-OV
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 TCGA-HNSC
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 TCGA-LUSC
## project.name
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Breast Invasive Carcinoma
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Esophageal Carcinoma
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Lung Squamous Cell Carcinoma
## 89389d68-618c-5180-b200-e13c8c8b958c Ovarian Serous Cystadenocarcinoma
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Head and Neck Squamous Cell Carcinoma
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Lung Squamous Cell Carcinoma
## project.dbgap_accession_number
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 <NA>
## acfa444f-c9e7-50ff-a0c5-2799b734f425 <NA>
## 9cdeb222-2715-5877-af0d-74dcf1243fdf <NA>
## 89389d68-618c-5180-b200-e13c8c8b958c <NA>
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 <NA>
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 <NA>
## submitter_id index_date state
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 TCGA-BH-A18Q Diagnosis released
## acfa444f-c9e7-50ff-a0c5-2799b734f425 TCGA-V5-A7RB Diagnosis released
## 9cdeb222-2715-5877-af0d-74dcf1243fdf TCGA-63-A5MP Diagnosis released
## 89389d68-618c-5180-b200-e13c8c8b958c TCGA-24-2036 Diagnosis released
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 TCGA-CR-7386 Diagnosis released
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 TCGA-92-8063 Diagnosis released
## consent_type lost_to_followup
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Consent by Death <NA>
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Informed Consent No
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Informed Consent <NA>
## 89389d68-618c-5180-b200-e13c8c8b958c Consent by Death <NA>
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Informed Consent <NA>
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Informed Consent No
## days_to_consent
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 NA
## acfa444f-c9e7-50ff-a0c5-2799b734f425 14
## 9cdeb222-2715-5877-af0d-74dcf1243fdf 36
## 89389d68-618c-5180-b200-e13c8c8b958c NA
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 42
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 -3
head(var_samples$id)
## [1] "f2b515f2-4524-515a-9e7d-77b0a69e7a28"
## [2] "acfa444f-c9e7-50ff-a0c5-2799b734f425"
## [3] "9cdeb222-2715-5877-af0d-74dcf1243fdf"
## [4] "89389d68-618c-5180-b200-e13c8c8b958c"
## [5] "c2f1cd44-fab0-55b1-ac88-9639d1d37223"
## [6] "2c9c0c71-a39d-5ff8-8cc1-f13549d363c0"
head(var_samples$ssm)
## start_position reference_allele ncbi_build
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 7676140 - GRCh38
## acfa444f-c9e7-50ff-a0c5-2799b734f425 7674220 C GRCh38
## 9cdeb222-2715-5877-af0d-74dcf1243fdf 7674921 C GRCh38
## 89389d68-618c-5180-b200-e13c8c8b958c 7673802 C GRCh38
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 7674899 GT GRCh38
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 7675149 T GRCh38
## cosmic_id mutation_subtype
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 COSM5833.... Small insertion
## acfa444f-c9e7-50ff-a0c5-2799b734f425 COSM1066.... Single base substitution
## 9cdeb222-2715-5877-af0d-74dcf1243fdf COSM1080.... Single base substitution
## 89389d68-618c-5180-b200-e13c8c8b958c COSM1066.... Single base substitution
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 COSM4522.... Small deletion
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 COSM1091.... Single base substitution
## mutation_type chromosome
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 Simple Somatic Mutation chr17
## acfa444f-c9e7-50ff-a0c5-2799b734f425 Simple Somatic Mutation chr17
## 9cdeb222-2715-5877-af0d-74dcf1243fdf Simple Somatic Mutation chr17
## 89389d68-618c-5180-b200-e13c8c8b958c Simple Somatic Mutation chr17
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 Simple Somatic Mutation chr17
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 Simple Somatic Mutation chr17
## ssm_id
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 5b38df58-5832-55da-ba7b-05089618de16
## acfa444f-c9e7-50ff-a0c5-2799b734f425 8d2dfec2-3a12-511c-90e9-3e29c039b548
## 9cdeb222-2715-5877-af0d-74dcf1243fdf 00452ceb-8656-5556-a064-99ed6019bc74
## 89389d68-618c-5180-b200-e13c8c8b958c b5249474-20f8-5245-8dc0-c548405baaa2
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 6e040409-6b17-5ffd-9618-8c4fb8fae24f
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 c6a2d78b-deb6-57a4-97fd-e77cf3a90ca6
## genomic_dna_change
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 chr17:g.7676140_7676141insTGCA
## acfa444f-c9e7-50ff-a0c5-2799b734f425 chr17:g.7674220C>T
## 9cdeb222-2715-5877-af0d-74dcf1243fdf chr17:g.7674921C>A
## 89389d68-618c-5180-b200-e13c8c8b958c chr17:g.7673802C>T
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 chr17:g.7674899delGT
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 chr17:g.7675149T>G
## tumor_allele end_position
## f2b515f2-4524-515a-9e7d-77b0a69e7a28 TGCA 7676141
## acfa444f-c9e7-50ff-a0c5-2799b734f425 T 7674220
## 9cdeb222-2715-5877-af0d-74dcf1243fdf A 7674921
## 89389d68-618c-5180-b200-e13c8c8b958c T 7673802
## c2f1cd44-fab0-55b1-ac88-9639d1d37223 - 7674900
## 2c9c0c71-a39d-5ff8-8cc1-f13549d363c0 G 7675149
head(var_samples$ssm_occurrence_id)
## [1] "f2b515f2-4524-515a-9e7d-77b0a69e7a28"
## [2] "acfa444f-c9e7-50ff-a0c5-2799b734f425"
## [3] "9cdeb222-2715-5877-af0d-74dcf1243fdf"
## [4] "89389d68-618c-5180-b200-e13c8c8b958c"
## [5] "c2f1cd44-fab0-55b1-ac88-9639d1d37223"
## [6] "2c9c0c71-a39d-5ff8-8cc1-f13549d363c0"
# checked the range of genomic positions in var_samples$ssm and they are all TP53
# var_samples$case also has primary site and project if need to cross reference
OUT=var_samples$case
OUT=OUT[,c("case_id","primary_site")]
OUT$TP53=1
# there are some duplicates in this dataframe to be removed, e.g.
#> var_samples$case[c(which(var_samples$case$case_id == "63d646e2-60aa-4bd5-b2ed-6254c9c51be4")),]
# primary_site disease_type available_variation_data case_id project.primary_site project.disease_type project.project_id project.name
#12de3c3e-89e9-5573-9de5-51f1a9c3a623 Floor of mouth Squamous Cell Neoplasms cnv, ssm 63d646e2-60aa-4bd5-b2ed-6254c9c51be4 Tonsil, .... Squamous.... TCGA-HNSC Head and Neck Squamous Cell Carcinoma
#7d4aaa9c-e501-5415-b063-86be2608a768 Floor of mouth Squamous Cell Neoplasms cnv, ssm 63d646e2-60aa-4bd5-b2ed-6254c9c51be4 Tonsil, .... Squamous.... TCGA-HNSC Head and Neck Squamous Cell Carcinoma
# project.dbgap_accession_number submitter_id index_date state consent_type lost_to_followup days_to_consent
#12de3c3e-89e9-5573-9de5-51f1a9c3a623 <NA> TCGA-QK-A6VB Diagnosis released Informed Consent No -12
#7d4aaa9c-e501-5415-b063-86be2608a768 <NA> TCGA-QK-A6VB Diagnosis released Informed Consent No -12
OUT=OUT[c(which(!duplicated(OUT$case_id))),]
# can now merge with the data extracted above
INm=merge(IN,OUT,by.x="bcr_patient_uuid",by.y="case_id",all=TRUE)
INm=INm[c(which(!is.na(INm$status))),]
INm$TP53[c(which(is.na(INm$TP53)))]=0
table(INm$TP53) # 1053 with a TP53 mutation
##
## 0 1
## 2037 1053
dim(INm) # 3090 individuals
## [1] 3090 8
# The numbers on the GDC are 3194 individuals and 1097 with a TP53 mutation
# now we can run the survival analysis
library(ggsurvfit)
#dev.new(height=4, width=4)
survfit2(Surv(time, status) ~ TP53, data = INm) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
)
# put the x-axis in years
# Since there are approximately 365.25 days in a year (accounting for leap years), divide the x-axis values by 365.25
survfit2(Surv(time, status) ~ TP53, data = INm) %>%
ggsurvfit() +
labs(
x = "Years",
y = "Overall survival probability"
) +
scale_x_continuous(
breaks = seq(0, max(INm$time, na.rm = TRUE), by = 5 * 365.25),
labels = scales::number_format(scale = 1/365.25, accuracy = 1)
)
# any difference in males vs females?
library(ggsurvfit)
#dev.new(height=4, width=4)
survfit2(Surv(time, status) ~ gender, data = INm) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
)
# switching to coxph for multivariable model
library(survival)
cox <- coxph(Surv(time, status) ~ TP53, data = INm); summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ TP53, data = INm)
##
## n= 3090, number of events= 700
##
## coef exp(coef) se(coef) z Pr(>|z|)
## TP53 0.42151 1.52426 0.07877 5.351 8.74e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## TP53 1.524 0.6561 1.306 1.779
##
## Concordance= 0.556 (se = 0.011 )
## Likelihood ratio test= 27.41 on 1 df, p=2e-07
## Wald test = 28.64 on 1 df, p=9e-08
## Score (logrank) test = 29.06 on 1 df, p=7e-08
cox <- coxph(Surv(time, status) ~ TP53 + as.factor(gender), data = INm); summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ TP53 + as.factor(gender),
## data = INm)
##
## n= 3090, number of events= 700
##
## coef exp(coef) se(coef) z Pr(>|z|)
## TP53 0.43294 1.54179 0.07881 5.494 3.94e-08 ***
## as.factor(gender)MALE 0.43919 1.55146 0.07604 5.776 7.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## TP53 1.542 0.6486 1.321 1.799
## as.factor(gender)MALE 1.551 0.6446 1.337 1.801
##
## Concordance= 0.599 (se = 0.012 )
## Likelihood ratio test= 60.92 on 2 df, p=6e-14
## Wald test = 61.99 on 2 df, p=3e-14
## Score (logrank) test = 62.91 on 2 df, p=2e-14